#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import requests
from bs4 import BeautifulSoup

# Base URL of the scraped site (scheme + host, no trailing slash); the
# functions below append site-relative paths to it to build request URLs.
dest = "http://xyxxw.forestpolice.net"

def getnewslst(timeout=10):
    """Return the href of every ``target="_blank"`` link on the news list page.

    The list page is ``dest + "/936/list.htm"``; it is parsed with the
    html5lib parser.

    Args:
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of href strings in document order. Anchors that carry no
        ``href`` attribute are left out, so the list never contains ``None``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (raised by ``raise_for_status``).
    """
    response = requests.get(dest + "/936/list.htm", timeout=timeout)
    # Fail loudly instead of scraping links out of an HTTP error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html5lib")
    hrefs = (anchor.get('href')
             for anchor in soup.find_all('a', target='_blank'))
    return [href for href in hrefs if href is not None]
    
def getnewscontent(timeout=10):
    """Download every ``.htm`` page linked from the news list and extract it.

    The links come from ``getnewslst()``. For each page the title is the text
    of the first ``<td class="biaoti">`` and the content is built from the
    ``<p>`` elements inside the first ``<table class="article">``:
    non-breaking spaces are removed, paragraphs that are empty once newlines
    are dropped are skipped, and each remaining paragraph is followed by a
    newline.

    Args:
        timeout: Seconds ``requests`` waits on each article request before
            giving up. The list page itself is fetched by ``getnewslst()``
            with that function's own defaults.

    Returns:
        A list of ``{'title': ..., 'content': ...}`` dicts, one per page that
        matched the article template, in link order. Pages lacking either the
        title cell or the article table are skipped rather than raising
        ``AttributeError``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Imported here so the function works on both Python 2 and Python 3
    # without touching the file's import block.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    articles = []
    for href in getnewslst():
        # Missing hrefs and anything that is not a .htm page are ignored.
        if not href or os.path.splitext(href)[1] != '.htm':
            continue

        # urljoin gives the same URL as dest + href for root-relative links,
        # and additionally copes with absolute or slash-less hrefs.
        response = requests.get(urljoin(dest + '/', href), timeout=timeout)
        soup = BeautifulSoup(response.text, 'html5lib')

        title_cell = soup.find('td', class_='biaoti')
        body_table = soup.find('table', class_='article')
        if title_cell is None or body_table is None:
            # Not an article page (or an error page): nothing to extract.
            continue

        paragraphs = []
        for node in body_table.find_all('p'):
            text = node.get_text().replace(u'\xa0', u'')
            if text.replace(u'\n', u'') != '':
                paragraphs.append(text + '\n')

        articles.append({
            'title': title_cell.get_text(),
            'content': ''.join(paragraphs),
        })
    return articles
    
if __name__ == '__main__':
    # Dump every scraped article to stdout: title, a blank line, then the
    # content. Each print takes exactly one parenthesised argument so the
    # script parses and behaves the same under Python 2 (print statement)
    # and Python 3 (print function); print('') emits the blank line in both.
    for article in getnewscontent():
        print(article['title'])
        print('')
        print(article['content'])